{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the vocabulary list is:\n['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']\npost0 vector=\n[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]\nall post vector are:\n[[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*- #可以使用中文注释\n",
    "#朴素贝叶斯用于文本分类\n",
    "\n",
    "#以网站发帖留言，作为文本数据集合 \n",
    "def loadDataSet():\n",
    "    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\n",
    "                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\n",
    "                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\n",
    "                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\n",
    "                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\n",
    "                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]\n",
    "    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not\n",
    "    return postingList,classVec\n",
    "\n",
    "#根据所有的留言，构建词汇表（包含所有的单词token）\n",
    "def createVocabularyList(dataSet):\n",
    "    vocabularySet = set([])  #create empty set，利用set去重的特点\n",
    "    for document in dataSet:\n",
    "        vocabularySet = vocabularySet | set(document) #union of the two sets ,取或操作(并集)\n",
    "    return list(vocabularySet)\n",
    "\n",
    "#词序模型，将某个输入留言inputSet转化为关于词汇表的0/1向量\n",
    "def setOfWordsVec(vocabularyList, inputSet):\n",
    "    returnVec = [0]*len(vocabularyList)\n",
    "    for word in inputSet:\n",
    "        if word in vocabularyList:\n",
    "            returnVec[vocabularyList.index(word)] = 1 #留言中的某个单词在词汇表中，则为1，否则为0\n",
    "        else:\n",
    "            print \"the word: %s is not in my Vocabulary!\" % word\n",
    "    return returnVec\n",
    "\n",
    "\n",
    "#现在测试一下效果\n",
    "postingList,classVec = loadDataSet()\n",
    "vocabularyList = createVocabularyList(postingList)\n",
    "print \"the vocabulary list is:\\n\",vocabularyList\n",
    "returnVec = setOfWordsVec(vocabularyList,postingList[0])\n",
    "print \"post0 vector=\\n\",returnVec\n",
    "\n",
    "#现在将所有的留言，都转化为0/1词汇表特征向量，作为trainVec\n",
    "trainVec = []\n",
    "for post in postingList:\n",
    "    trainVec.append(setOfWordsVec(vocabularyList,post))\n",
    "\n",
    "print \"all post vector are:\\n\",trainVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p(y=1) =  0.5\ny=1,the word with the max Probability=(0.157895), is(stupid)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "#其实bayes的核心函数就：NaiveBayes0 + classify\n",
    "\n",
    "#bayes求得，特征向量的概率，p(w/y)\n",
    "def NaiveBayes0(trainMatrix,trainCategory):\n",
    "    numTrain = len(trainMatrix) #训练的文档个数\n",
    "    numWords = len(trainMatrix[0]) #词汇表的大小，即特征的大小\n",
    "    p1 = sum(trainCategory)/float(numTrain)  #分类1的概率p(y=1)，这里是二分类，所以，p0=1-p1\n",
    "    p0Num = np.zeros(numWords); p1Num = np.zeros(numWords)      #初始化为0\n",
    "    p0Denom = 0.0; p1Denom = 0.0                        \n",
    "    for i in range(numTrain):\n",
    "        if trainCategory[i] == 1:\n",
    "            p1Num += trainMatrix[i] #y=1条件下，统计某个单词出现的个数，用于计算p(w/y=1)\n",
    "            p1Denom += sum(trainMatrix[i]) #累计y=1的所有单词数量\n",
    "        else:\n",
    "            p0Num += trainMatrix[i] #y=0条件下，统计某个单词出现的个数，用于计算p(w/y=0)\n",
    "            p0Denom += sum(trainMatrix[i]) #累计y=0的所有单词数量\n",
    "    p1Vect = p1Num/float(p1Denom)     #p(w/y=1)   \n",
    "    p0Vect = p0Num/float(p0Denom)     #p(w/y=0)  \n",
    "    return p0Vect,p1Vect,p1\n",
    "\n",
    "#测试一下\n",
    "p0Vect,p1Vect,p1 = NaiveBayes0(trainVec,classVec)\n",
    "print \"p(y=1) = \",p1  #classVec = [0,1,0,1,0,1] ,so p1=0.5\n",
    "print \"y=1,the word with the max Probability=(%f), is(%s)\"\\\n",
    "      %(np.max(p1Vect),vocabularyList[np.argmax(p1Vect)])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#根据李航的《统计学习方法》，上面的估计概率的 方法叫做————“极大似然估计”\n",
    "#有两个缺点：\n",
    "# 1，有些概率=0，那么，后面的概率相乘，会影响计算结果，所以改进-----》李航书上：贝叶斯估计概率\n",
    "# 2,很多小数相乘，会下溢出，所以加上log\n",
    "\n",
    "#修改一下上面的函数\n",
    "\n",
    "#bayes求得，特征向量的概率，p(w/y)\n",
    "def NaiveBayes1(trainMatrix,trainCategory):\n",
    "    numTrain = len(trainMatrix) #训练的文档个数\n",
    "    numWords = len(trainMatrix[0]) #词汇表的大小，即特征的大小\n",
    "    p1 = sum(trainCategory)/float(numTrain)  #分类1的概率p(y=1)，这里是二分类，所以，p0=1-p1\n",
    "    p0Num = np.ones(numWords); p1Num = np.ones(numWords)      #change to 1 ,一般sigma = 1\n",
    "    p0Denom = 2.0; p1Denom = 2.0                        # change to 2.0 因为有两个类别\n",
    "    for i in range(numTrain):\n",
    "        if trainCategory[i] == 1:\n",
    "            p1Num += trainMatrix[i] #y=1条件下，统计某个单词出现的个数，用于计算p(w/y=1)\n",
    "            p1Denom += sum(trainMatrix[i]) #累计y=1的所有单词数量\n",
    "        else:\n",
    "            p0Num += trainMatrix[i] #y=0条件下，统计某个单词出现的个数，用于计算p(w/y=0)\n",
    "            p0Denom += sum(trainMatrix[i]) #累计y=0的所有单词数量\n",
    "    p1Vect = np.log(p1Num/float(p1Denom))         #change to log()\n",
    "    p0Vect = np.log(p0Num/float(p0Denom))          #change to log()\n",
    "    return p0Vect,p1Vect,p1\n",
    "\n",
    "#分类的函数，计算输入为x的条件下，属于每个类别的概率\n",
    "def classify(testVec, p0Vec, p1Vec, pClass1):\n",
    "    p1 = sum(testVec * p1Vec) + np.log(pClass1)    #因为是log,所以这里是求和以及+号操作\n",
    "    #p(y=1/w) = p(w/y=1) * p(y=1),注意这里需要乘上，testVec,过滤掉那些为0的特征的概率\n",
    "    p0 = sum(testVec * p0Vec) + np.log(1.0 - pClass1) #p(y=0) = 1 - p(y=1)\n",
    "    if p1 > p0: #选取最大的概率的类\n",
    "        return 1\n",
    "    else: \n",
    "        return 0\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['love', 'my', 'dalmation'] classified as:  0\n['stupid', 'garbage'] classified as:  1\n"
     ]
    }
   ],
   "source": [
    "#测试一下，看看对于一段留言的分类效果\n",
    "p0V,p1V,p1 = NaiveBayes1(trainVec,classVec)\n",
    "test0 = ['love', 'my', 'dalmation']\n",
    "testVec0 = setOfWordsVec(vocabularyList, test0)\n",
    "print test0,'classified as: ',classify(testVec0,p0V,p1V,p1)\n",
    "test1 = ['stupid', 'garbage']\n",
    "testVec1 = setOfWordsVec(vocabularyList, test1)\n",
    "print test1,'classified as: ',classify(testVec1,p0V,p1V,p1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#下面我们将贝叶斯模型用于垃圾邮件分类\n",
    "\n",
    "\n",
    "#词袋模型，需要统计某个token出现的次数\n",
    "def bagOfWordsVec(vocabularyList, inputSet):\n",
    "    returnVec = [0]*len(vocabularyList)\n",
    "    for word in inputSet:\n",
    "        if word in vocabularyList:\n",
    "            returnVec[vocabularyList.index(word)] += 1 #出现一次，累加一次\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "#文档处理，\n",
    "def textParse(bigString):    #input is big string, #output is word list\n",
    "    listOfTokens = re.split(r'\\W*', bigString) #只需要字符和数字\n",
    "    return [tok.lower() for tok in listOfTokens if len(tok) > 2] #变成小写，过滤长度小于3的字符串 \n",
    "    \n",
    "def spamTest():\n",
    "    docList=[]; classList = []; fullText =[]\n",
    "    for i in range(1,26): #每个文件夹，有25个文件\n",
    "        wordList = textParse(open('bayes/email/spam/%d.txt' % i).read())\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(1)\n",
    "        wordList = textParse(open('bayes/email/ham/%d.txt' % i).read())\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(0)\n",
    "    vocabularyList = createVocabularyList(docList)#create vocabulary\n",
    "    trainingSet = range(50); testSet=[]   #create test set,这里其实是val set，而且只保存了下标\n",
    "    for i in range(10):\n",
    "        randIndex = int(np.random.uniform(0,len(trainingSet)))\n",
    "        testSet.append(trainingSet[randIndex])\n",
    "        del(trainingSet[randIndex])  #随机选取10个，并且从train中删除\n",
    "    trainMat=[]; trainClasses = []\n",
    "    for docIndex in trainingSet:#train the classifier (get probs) trainNB0\n",
    "        trainMat.append(bagOfWordsVec(vocabularyList, docList[docIndex]))\n",
    "        trainClasses.append(classList[docIndex])\n",
    "    p0V,p1V,pSpam = NaiveBayes1(trainMat,trainClasses) #学习得到的train的概率\n",
    "    errorCount = 0\n",
    "    for docIndex in testSet:        #classify the remaining items\n",
    "        wordVector = bagOfWordsVec(vocabularyList, docList[docIndex])\n",
    "        if classify(wordVector,p0V,p1V,pSpam) != classList[docIndex]: #用于test/val\n",
    "            errorCount += 1\n",
    "            print \"classification error\",docList[docIndex]\n",
    "    print 'the error rate is: ',float(errorCount)/len(testSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate is:  0.0\n"
     ]
    }
   ],
   "source": [
    "#因为是随机选取10个留存作为val set,所以，error rate 会变化\n",
    "spamTest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classification error ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for']\nthe error rate is:  0.1\n"
     ]
    }
   ],
   "source": [
    "spamTest()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
